#Leah Downey 
#April 27, 2016
#code for rescuing intutitions 

#First load in replication data from Kelly-Enns 2010, call it x 
#then load data for trust in government, call it trust.of.government 
#load in trust of government data 
require(foreign)
require(lmtest)

#load Kelly-Enns 2010 data 
#load trust in goverment data 

#start by cleaning and formatting the trust data 
years <- c(1958,1964,1966,1968,1970,1972,1974,1976:1980,1982, 1984:2015)
trust <- rep(NA, length(unique(trust.of.government$year)))

#create a matrix for the trust data 
trust <- matrix(1952:2006,55,1)
trust <-cbind(trust, NA)

#add a trust column to the data set 
x <- cbind(x, NA)
names(x)[11]<-"trust"

#write a loop that finds the years in x where we have a trust variable and put it in the data matrix  
for(i in 1:55 ){
  location<- c(which(trust[i,1]==trust.of.government$year))
  trust[i,2] <- mean(c(trust.of.government$moving.avg[location]))}


for( i in 1:45){
  spot <- which(x$year==trust[i,1])
  x$trust[spot] <- trust[spot,2]
}




diff_mood<-diff(x$mood)
diff_policy <-diff(x$policy)
diff_unemployment <-diff(x$unemployment)
diff_inflation <- diff(x$inflation)
diff_gini <- diff(x$gini)
diff_trust <- diff(x$trust)
diff_mood_low <- diff(x$mood_lowinc)
diff_mood_high <- diff(x$mood_highinc)



lag_mood <- na.omit(lag(x$mood))[-55]
lag_policy <- na.omit(lag(x$policy))[-55]
lag_unemployment <- na.omit(lag(x$unemployment))[-55]
lag_inflation <-na.omit(lag(x$inflation))[-55]
lag_gini <- na.omit(lag(x$gini))[-55]
lag_trust <- lag(x$trust)[-55]
lag_mood_low <-lag(x$mood_lowinc)[-55]
lag_mood_high <-lag(x$mood_highinc)[-55]



#create a data frame with all relevant variables  
data = data.frame(diff_mood, lag_mood, diff_policy, lag_policy, 
                  diff_unemployment, lag_unemployment, diff_inflation, 
                  lag_inflation, diff_gini, lag_gini, 
                  diff_trust, lag_trust, diff_mood_high, lag_mood_high,
                  diff_mood_low, lag_mood_low)



#drop out the ends of the data where I am lacking observations
#only keep set of observations with max one year between them 
data.clean <-data[-c(1:12,46:54),]

to.drop <-which(data.clean$lag_trust=="NaN")
#33 obs left 

data.clean<-data.clean[-c(to.drop),]
#25 obs left 

#The first model, using all the observations, it has difference in public mood as a dependent variable
#and covariates are: lag_mood, lag_gini, lag_inflation, lag_unemployment, and lag_trust in government 
#use regular OLS regression as they do in Kelly-Enns

co_variates_trust <- as.matrix(data.clean[,c(2,6,8,10,12)]) 
dpt_var_trust <- data.clean[,1]
trust.results <-lm(dpt_var_trust ~ co_variates_trust ) 
summary(trust.results)

bgtest(trust.results, order = 1, order.by = NULL, type = c("Chisq", "F"), data = list(), fill = 0)
#recall Ho for this test is that there is no serial correlation, so a small p-value, as the 
#one we get here, would mean we can reject Ho, implying there is likely serial correlation
#as a result, we employ the method in Kelly-Enns 2010, prais winsten 

#load external function that does prais.winsten in R 
prais.winsten <-
  function(formula,data,iter=50,rho=0,tol=.00000001){
    mod<-model.frame(formula,data=data)
    lm<-lm(mod)
    n<-length(mod[,1])
    list.rho<-c(0)
    imax<-ncol(mod)-1
    fo<-as.formula(paste("y ~ -1 + x0 +",paste(paste0("x",1:imax), collapse= "+")))
    # In case rho is known
    if (rho!=0) {
      y<-c((1-rho^2)^(1/2)*mod[1,1],mod[2:n,1]-rho*mod[1:(n-1),1])
      x0<-c((1-rho^2)^(1/2),rep(1-rho,n-1))
      for (i in 1:imax) {
        x<-c((1-rho^2)^(1/2)*mod[1,(i+1)],mod[2:n,(i+1)]-rho*mod[1:(n-1),(i+1)])
        assign(paste("x",i,sep=""),x)
      }
      lm<-lm(fo)
      j<-1
      rho.tstat<-"none"
    }
    # Include the estimation of rho (usual case)
    else {
      res<-lm$res
      res_1<-c(NA,res[-n])
      for (i in 1:iter){
        rho.lm<-lm(res ~ res_1 -1)
        rho<-rho.lm$coeff[1]
        if (abs(rho-tail(list.rho,n=1))<tol) {
          j<-i
          break
        }
        else {
          list.rho<-append(list.rho,rho)
          y<-c((1-rho^2)^(1/2)*mod[1,1],mod[2:n,1]-rho*mod[1:(n-1),1])
          x0<-c((1-rho^2)^(1/2),rep(1-rho,n-1))
          for (k in 1:imax) {
            x<-c((1-rho^2)^(1/2)*mod[1,(k+1)],mod[2:n,(k+1)]-rho*mod[1:(n-1),(k+1)])
            assign(paste("x",k,sep=""),x)
          }
          lm<-lm(fo)
          fit<-as.vector(rep(lm$coef[1],n))+as.vector(as.matrix(mod[,2:(imax+1)]) %*% lm$coef[2:(imax+1)])
          res<-mod[,1]-fit
          res_1<-c(NA,res[-n])
          j<-i
          rho.tstat<-summary(rho.lm)$coef[1,3]
        }
      }
    }
    if (iter==50) i<-j-1 else i<-j
    attr(lm$coefficients,"names")<-c('Intercept',names(mod)[2:ncol(mod)])
    s<-summary(lm)
    r<-data.frame('Rho'=rho,'Rho.t-statistic'=rho.tstat,'Iterations'=i,row.names=c(''))
    results<-list(s,r)
    return(results)
  }

#The following is the same as the model above, except it assumes an AR(1) process 
trust_prais <- prais.winsten(diff_mood   ~ lag_mood+lag_gini+lag_inflation+lag_unemployment+lag_trust, data=data.clean,iter=50,rho=0,tol=.00000001)
trust_prais

#repeat the same modeling procedure for the low income case
#the only change here is the dependent variable and the lagged mood covariate 

co_variates_low <- as.matrix(data.clean[,c(16,6,8,10,12)])
dpt_var_low <- data.clean[,15]
low_trust <-lm(dpt_var_low~co_variates_low)
summary(low_trust)


bgtest(low_trust, order = 1, order.by = NULL, type = c("Chisq", "F"), data = list(), fill = 0)
#again we find indications of an AR(1)

low_trust_prais <- prais.winsten(diff_mood_low   ~ lag_mood_low+lag_gini+lag_inflation+lag_unemployment+lag_trust, data=data.clean,iter=50,rho=0,tol=.00000001)
low_trust_prais

#repeat again for the high income case 
co_variates_high <- as.matrix(data.clean[,c(14,6,8,10,12)])
dpt_var_high <- data.clean[,13]
high_trust <-lm(dpt_var_high~co_variates_high)
summary(high_trust)

bgtest(high_trust, order = 1, order.by = NULL, type = c("Chisq", "F"), data = list(), fill = 0)
#this case seems not to show signs of AR(1)

#below we run the prais-winsten model, which does no harm and shows that the resuts still hold 
high_trust_prais <- prais.winsten(diff_mood_high  ~ lag_mood_high+lag_gini+lag_inflation+lag_unemployment+lag_trust, data=data.clean,iter=50,rho=0,tol=.00000001)
high_trust_prais


#Figure 2, plot of trust over time 
plot(trust[,1], trust[,2], xlab="Year", ylab="Trust in Government", main="Trust in Government since 1958" )

#Figure 3, simulations of Kelly-Enns 2010 model with high and low gini coefficients 
#reload the Kelly-Enns data and call it x
require(scales)
#make the variables 

diff_mood<-diff(x$mood) 
diff_policy <-diff(x$policy)
diff_unemployment <-diff(x$unemployment)
diff_inflation <- diff(x$inflation)
diff_gini <- diff(x$gini)



lag_mood <- na.omit(lag(x$mood))[-55]
lag_policy <- na.omit(lag(x$policy))[-55]
lag_unemployment <- na.omit(lag(x$unemployment))[-55]
lag_inflation <-na.omit(lag(x$inflation))[-55]
lag_gini <- na.omit(lag(x$gini))[-55]



#create a data frame with all relevant variables  
data = data.frame(diff_mood, lag_mood, diff_policy, lag_policy, 
                  diff_unemployment, lag_unemployment, diff_inflation, 
                  lag_inflation,  diff_gini, lag_gini)

#replicate their model results, from Table 1 column 3 
co_variates_1.3 <- as.matrix(data[,2:10])
dpt_var_1.3 <- data[,1]
table1_col3 <- lm(dpt_var_1.3 ~ co_variates_1.3 ) 
summary(table1_col3)
table1_col3$coeff
vcov_tab3 <- vcov(table1_col3 )

#calculate 1000 simulated betas, using their estimated coefficients and variance-covariance matrix 
library(mvtnorm)
set.seed(8352)
sim_betas<- rmvnorm(n=1000, mean=table1_col3$coeff, sigma=vcov_tab3)


#Simulations
n<- 1000
#number of simualtions is n, I used 1000 to make the graph in the paper 

#create the covariate matricies for the simulations, one that uses the observational data, but replaces 
#all the gini coefficients with the lowest observed value, and the other that does the same with the highest
covs.high<-co_variates_1.3
covs.high[,9]<-0.44
covs.high<-cbind(1,covs.high)
covs.low<-co_variates_1.3
covs.low[,9]<-0.35
covs.low<-cbind(1,covs.low)


#t is a value that controls the transparency of the graphs lines, so you can see 20000 in one place 
t <- 0.06

#Start with the case of the low gini measure 
#make empty matricies for the simulations and errors, the number of rows is 
#the number of observations in the time series, and then each column is for a different simulation
simulations.low <- matrix(NA, nrow=54, ncol=n)
errors.low <- matrix(NA, nrow=254, ncol=n)
A <- as.matrix(c(1,co_variates_1.3[1,]))

#calculate a set of error terms, using the estimated variance from their model 
wns<-rnorm(mean=0, n=54, sd=summary(table1_col3)$sigma) 

#pick out the first row of simulated betas to use first 
sim_betas_use<- as.matrix(sim_betas[1,])

#fill in the first simulation's first observation, using the first simulated betas, 
#the first row of covariates and the first simulated error 
simulations.low[1,1]<-t(sim_betas_use)%*%as.matrix(covs.low[1,])+wns[1]


#run a loop for the rest of the first simulation, completing the 2-54 observations
#using the same simulated betas, the relevant covariate values, and teh relevant error term 
for(j in 2:54){
  simulations.low[j,1]<-t(sim_betas_use)%*%as.matrix(covs.low[j,])+wns[j]
}

time <- 1953:2006
#plot the first simulation
plot(time, simulations.low[,1], type="l", col=alpha("red", t), ylim=c(-10,10), xlab="Time", ylab="Difference in Mood", main="Kelly-Enns 2010")

#then run a loop for the remaining 999 simulations, doing the same thing, adding each one to the graph as you go along 
for(i in 2:n){
  sim_betas_use<- as.matrix(sim_betas[i,])
  simulations.low[1,i]<-t(sim_betas_use)%*%as.matrix(covs.low[1,])+wns[1]
  
  for(j in 2:54){
    simulations.low[j,i]<-t(sim_betas_use)%*%as.matrix(covs.low[j,])+wns[j]
  }
  lines(time, simulations.low[,i],  col=alpha("red", t))
}

#repeat the exact same process with the "high" gini covariate matrix
#this time simply add these simulations to the same plot, but in a different color 
#high
simulations.high <- matrix(NA, nrow=54, ncol=n)
errors.high<- matrix(NA, nrow=54, ncol=n)
A <- as.matrix(c(1,co_variates_1.3[1,]))
wns<-rnorm(mean=0, n=54, sd=summary(table1_col3)$sigma) 

sim_betas_use<- as.matrix(sim_betas[1,])
simulations.high[1,1]<-t(sim_betas_use)%*%as.matrix(covs.high[1,])+wns[1]

for(j in 2:54){
  simulations.high[j,1]<-t(sim_betas_use)%*%as.matrix(covs.high[j,])+wns[j]
}

for(i in 2:n){
  sim_betas_use<- as.matrix(sim_betas[i,])
  simulations.high[1,i]<-t(sim_betas_use)%*%as.matrix(covs.high[1,])+wns[1]
  
  for(j in 2:54){
    simulations.high[j,i]<-t(sim_betas_use)%*%as.matrix(covs.high[j,])+wns[j]
  }
  lines(time, simulations.high[,i],  col=alpha("green", t))
}


#Do the same thing, but for the alternative model, offered in the paper 
#start by calculating the model using the same covariates and dependent variates
#but with a MLE AR(1) model instead of the prais.winsten used above. I did this 
#simply because it makes simulation easier, and as you can see, has almost identical results
#to the prais winsten calculation 

trust_MLE <- arima(dpt_var_trust, order= c(1, 0, 0), xreg=co_variates_trust)
trust_MLE

#seperate out the rho and beta estimated coefficients from the model
par <-coef(trust_MLE)
betas<- coef(trust_MLE)[2:7]
rho <-coef(trust_MLE)[1]

#as well as teh estimated variance 
vcov_trust <- vcov(trust_MLE)

n<-1000
t<- 0.06
#simulate parameters, rho and betas, based on the estimated values and variances 
library(mvtnorm)
set.seed(8352)
sim_pars <- rmvnorm(n=n, mean=par, sigma=vcov_trust)
sim_betas<-sim_pars[,-1]
sim_rhos<-sim_pars[,1]

#again form covariance matricies with the observed values, replacing all the gini lags with the highest 
#and lowest observed values 
covs.high<-co_variates_trust
covs.high[,4]<-0.44
covs.high<-cbind(1,covs.high)
covs.low<-co_variates_trust
covs.low[,4]<-0.35
covs.low<-cbind(1,covs.low)


#start with the low gini value case, follow mostly the same procedure as above
simulations.low <- matrix(NA, nrow=25, ncol=n)
errors.low <- matrix(NA, nrow=25, ncol=n)
#in this case, because of the AR process, we must use a starting value for the first 
#error, I use the model's residual, A below, the set of the first observed covariates, is necessary 
#to calulate the residual 
A <- as.matrix(c(1,co_variates_trust[1,]))
#again, simulated error terms 
wns<-rnorm(mean=0, n=25, sd=sqrt(trust_MLE$sigma2)) 

#the first error, is the starting error, which uses the residual 
errors.low[1,1] <- sim_rhos[1]*(data.clean[1,1]-(t(A)%*%sim_betas[1,]))+wns[1]

#use the first row of simulated betas for the first simulation
sim_betas_use<- as.matrix(sim_betas[1,])

#start the first simulation 
simulations.low[1,1]<-t(sim_betas_use)%*%as.matrix(covs.low[1,])+errors.low[1,1]

#write a loop to finish the first simulation, based on an AR(1) model here 
for(j in 2:25){
  error_use<-sim_rhos[1]*errors.low[j-1,1]+wns[j]
  simulations.low[j,1]<-t(sim_betas_use)%*%as.matrix(covs.low[j,])+error_use
  errors.low[j,1]<-error_use
}
#plot the first simulation 
x.axis<-x$year[x$trust!="NaN"]
x.axis<-c(x.axis[1:25])

plot(x.axis, simulations.low[,1], type="l", col=alpha("red", t), ylim=c(-10,10), xlab="Time", ylab="Difference in Mood", main="Alternative Model")

#write a loop that goes through the same exact process for the remaining 999 simulations plotting them along the way 
for(i in 2:n){
  errors.low[1,i] <- sim_rhos[i]*(data.clean[1,1]-(t(A)%*%sim_betas[i,]))+wns[1]
  sim_betas_use<- as.matrix(sim_betas[i,])
  simulations.low[1,i]<-t(sim_betas_use)%*%as.matrix(covs.low[1,])+errors.low[1,i]
  
  for(j in 2:25){
    error_use<-sim_rhos[1]*errors.low[j-1,i]+wns[j]
    simulations.low[j,i]<-t(sim_betas_use)%*%as.matrix(covs.low[j,])+error_use
    errors.low[j,i]<-error_use
  }
  lines(x.axis, simulations.low[,i],  col=alpha("red", t))
}


#repeat exactly the same process for the high gini coefficent case 
simulations.high <- matrix(NA, nrow=25, ncol=n)
errors.high<- matrix(NA, nrow=25, ncol=n)
A <- as.matrix(c(1,co_variates_trust[1,]))
wns<-rnorm(mean=0, n=25, sd=sqrt(trust_MLE$sigma2)) 

errors.high[1,1] <- sim_rhos[1]*(data.clean[1,1]-(t(A)%*%sim_betas[1,]))+wns[1]
sim_betas_use<- as.matrix(sim_betas[1,])
simulations.high[1,1]<-t(sim_betas_use)%*%as.matrix(covs.high[1,])+errors.low[1,1]

for(j in 2:25){
  error_use<-sim_rhos[1]*errors.high[j-1,1]+wns[j]
  simulations.high[j,1]<-t(sim_betas_use)%*%as.matrix(covs.high[j,])+error_use
  errors.high[j,1]<-error_use
}

for(i in 2:n){
  errors.high[1,i] <- sim_rhos[i]*(data.clean[1,1]-(t(A)%*%sim_betas[i,]))+wns[1]
  sim_betas_use<- as.matrix(sim_betas[i,])
  simulations.high[1,i]<-t(sim_betas_use)%*%as.matrix(covs.high[1,])+errors.low[1,i]
  
  for(j in 2:25){
    error_use<-sim_rhos[1]*errors.high[j-1,i]+wns[j]
    simulations.high[j,i]<-t(sim_betas_use)%*%as.matrix(covs.high[j,])+error_use
    errors.high[j,i]<-error_use
  }
  lines(x.axis, simulations.high[,i],  col=alpha("green", t))
}


#preferene for govenremnt reduction of ineuqality with trust 

#load Kelly-Enns 2010 data 
#load redistribution preference data 
dim(redist.prefs)

#re-code the preference data 
unique(redist.prefs$Should.govt.reduce.income.differences) #what are the levels
#Not applicable   2                5                4                3                6                No govt action  
# Govt reduce diff Don't know       No answer     

levels(redist.prefs$Should.govt.reduce.income.differences)
levels(redist.prefs$Should.govt.reduce.income.differences)<-c(NA, 2, 3, 4, 5, 6, NA, 1, NA, 7, NA)

unique(redist.prefs$Should.govt.reduce.income.differences)
#<NA> 2    5    4    3    6    7    1   
#Levels: 2 3 4 5 6 1 7

#how many do we have of each? to check myself later 
one <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==1))
two <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==2))
three <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==3))
four <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==4))
five <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==5))
six <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==6))
seven <- sum(na.omit(redist.prefs$Should.govt.reduce.income.differences==7))
NAs <-  sum(is.na(redist.prefs$Should.govt.reduce.income.differences))


# get rid of NAs 
redist.prefs.short <- na.omit(redist.prefs)
length(unique(redist.prefs.short$Gss.year.for.this.respondent))

#Take the mean of all observatons I have for each year 
means_pref <-matrix(unique(redist.prefs.short$Gss.year.for.this.respondent), nrow=22, ncol=1)
means_pref <- cbind(means_pref, NA)

for(i in 1:22 ){
  location<- c(which(means_pref[i,1]==redist.prefs.short$Gss.year.for.this.respondent))
  means_pref[i,2] <- mean(c(redist.prefs.short$Should.govt.reduce.income.differences[location]))}

means_pref <- as.matrix(means_pref)
names(means_pref)<-c("year", "prefs")


#add the Trust variable, in same way as above 
require(foreign)
#load trust in government data 
years <- c(1958,1964,1966,1968,1970,1972,1974,1976:1980,1982, 1984:2015)


trust <- matrix(1952:2006,55,1)
trust <-cbind(trust, NA)

x <- cbind(x, NA)
names(x)[11]<-"trust"


for(i in 1:55 ){
  location<- c(which(trust[i,1]==trust.of.government$year))
  trust[i,2] <- mean(c(trust.of.government$moving.avg[location]))}


for( i in 1:45){
  spot <- which(x$year==trust[i,1])
  x$trust[spot] <- trust[spot,2]
}

x<-x[,-c(7,10)]

#drop years we don't have 
x <- x[c(27:55),]
x<- x[-c(2,4,5,8,15,18,20,22,24,26,28,30,32,34,36),]
prefs <-type.convert(means_pref[1:18,2])
x$prefs <- prefs

x <-na.omit(x)

#create variables 
diff_mood<-diff(x$mood)
diff_policy <-diff(x$policy)
diff_unemployment <-diff(x$unemployment)
diff_inflation <- diff(x$inflation)
diff_gini <- diff(x$gini)
diff_trust <- diff(x$trust)
diff_mood_low <- diff(x$mood_lowinc)
diff_mood_high <- diff(x$mood_highinc)
diff_prefs <-diff(x$prefs)

lag_mood <- na.omit(lag(x$mood))[-12]
lag_policy <- na.omit(lag(x$policy))[-12]
lag_unemployment <- na.omit(lag(x$unemployment))[-12]
lag_inflation <-na.omit(lag(x$inflation))[-12]
lag_gini <- na.omit(lag(x$gini))[-12]
lag_trust <- lag(x$trust)[-12]
lag_mood_low <-lag(x$mood_lowinc)[-12]
lag_mood_high <-lag(x$mood_highinc)[-12]
lag_prefs <-na.omit(lag(x$prefs))[-12]

#create a data frame with all relevant variables  
data = data.frame(diff_mood, lag_mood, diff_policy, lag_policy, 
                  diff_unemployment, lag_unemployment, diff_inflation, 
                  lag_inflation, diff_gini, lag_gini, 
                  diff_trust, lag_trust, diff_mood_high, lag_mood_high,
                  diff_mood_low, lag_mood_low, diff_prefs, lag_prefs)


#all population, same model, except with preferences as dpt variable instead of public mood 
co_variates <- as.matrix(data[,c(18,6,8,10,12)]) #which variables in this model 
dpt_var <- data[,17]
results <-lm(dpt_var ~ co_variates) 
summary(results)

bgtest(results, order = 1, order.by = NULL, type = c("Chisq", "F"), data = list(), fill = 0)


#do this with prais.winsten 
results_gls <- prais.winsten(diff_prefs   ~ lag_prefs+lag_gini+lag_inflation+lag_unemployment+lag_trust, data=data,iter=50,rho=0,tol=.00000001)
results_gls

















